4 R and RJAGS

From the report introduction:

There are two advantages to our proposed system compared with the current workflow:

  1. Our system combines data from multiple sources into a statistical model that includes uncertainty using Bayesian statistics.
  2. The operator can interact with the internal model through Excel to conduct scenario analysis and automatically visualise the results.

— Logan

This Rmarkdown-generated page will serve as proof that a fully automated proof of concept has been developed. Whether the code is sufficiently commented or not … is a different question.

4.1 Setup

configpath = '../wairakei_data/config.xlsx'
regdatapath = '../wairakei_data/data.xlsx'
extraliqregpath = "../wairakei_data/extra_liq.csv" # for regression
extradatapath = "../wairakei_data/well_pi.csv"     # ts data
pipath <- "../wairakei_data/short version Generation Projection 2016.xlsx"

base_year = '2000'
prediction_date = '2017-12-01'
production_curve_wells = c('wk255', 'wk263')
tsplotwells = c("wk118", "wk216", "wk605")
decline_wells = c(production_curve_wells, "wk272", "wk86", "wk116")
base_datetime = as.POSIXct(paste(base_year, 1, 1, sep='-'))
today_datetime = as.POSIXct(prediction_date)
# theme_update(text=element_text(family="Times New Roman"))
'%ni%' <- Negate('%in%')
# for over-plotting
special_wells = c(production_curve_wells, tsplotwells, "wk86", "wk116")
use.censor = F

n_steps = 1000

censor = function(x, type) {
  # Hash the facility identifier (beware of hash clashes)
  if (!use.censor) {
    return(x)
  } else if (type=="well") {
    return(paste0("w", toupper(substr(sha1(x), 1, 3))))
  } else if (type=="fp") {
    return(paste0("fp", toupper(substr(sha1(x), 1, 2))))
  }
}

4.2 Data Handling

Data is extracted and cleaned using Python in simulation.ipynb. The Python notebook is also used to generate a rudimentary config file, but some things (network connectivity) are specified manually.

R is used to:

  • Read raw data and config from Excel/CSV files
  • Do additional pre-processing that depends on the data available
  • Censor sensitive facility names
  • Create a graph structure
  • Make the data into a JAGS-readable format

4.2.1 Load Data

Reads data from several spreadsheets, including PI data. PI data is special because it has not been pre-processed. It requires additional filtering and basic pre-processing.

# read in config data
configsheets = excel_sheets(configpath)
for (sheet in configsheets) {
  assign(sheet, read_excel(configpath, sheet))
}
stopifnot(!anyDuplicated(well_fp_map$well)) # each well cannot map to multiple flash plants

# read in PI data
PI <- read_excel(pipath, "From PI sheet", skip=1) %>%
  rename(facility = Unit,
         variable = X__1,
         id = X__2,
         description = X__3,
         code = X__4) %>%
  gather(key="datechar", value="value", -c(facility, variable, id, description, code)) %>%
  mutate(date = as.Date(as.numeric(datechar), origin = "1899-12-30"),
         value = as.numeric(value)) %>% select(-c(datechar, id)) %>%
  mutate_if(is.character, tolower) %>%
  mutate(value = as.numeric(value)) %>%
  drop_na(value) %>%
  filter(date >= as.Date("2017-11-01"), date < as.Date(prediction_date)) %>%
  filter(!str_detect(variable, "condition|calc")) %>%
  filter(str_detect(facility, "wk")) 
extra_liq <- PI %>%
  select(facility, date, variable, value) %>% 
  # filter(value>1e-4) %>%
  filter(str_detect(variable, "plot|phase|whp|flow")) %>%
  spread(key=variable, value=value) %>%
  mutate(mf = pmax(`2phase flow`, `fp14  plot flow`, `fp15  plot flow`, `flow`, na.rm=T),
         whp = pmax(`fp14  plot whp`, `fp15  plot whp`, `fp16  plot whp`, `whp`, na.rm=T),
         source = "PI Database") %>%
  select(well=facility, date, whp, mf, source) %>%
  drop_na()

# read in regression data (plus extra)
regression_df = read_excel(regdatapath) %>% mutate(source="Well Tests")
dry_df = PI %>%
  filter(str_detect(facility, "wk")) %>%
  select(facility, date, variable, value) %>%
  # filter(value>1e-2) %>%
  group_by(facility, date) %>%
  spread(key=variable, value=value) %>%
  select(facility, date, `ip sf`, `actual massflow`) %>%
  gather(key="key", value="mf", `ip sf`, `actual massflow`) %>%
  ungroup() %>%
  drop_na() %>%
  rename(well=facility)

4.2.2 Censor names

Censor well and flash plant names using a hash algorithm. Change the flag in setup to disable.

dry_df$well = censor(dry_df$well, "well")
extra_liq$well = censor(extra_liq$well, "well")
fp_constants$fp = censor(fp_constants$fp, "fp")
fp_gen_map$fp = censor(fp_gen_map$fp, "fp")
operating_conditions$well = censor(operating_conditions$well, "well")
regression_df$well = censor(regression_df$well, "well")
well_fp_map$well = censor(well_fp_map$well, "well")
well_fp_map$fp = censor(well_fp_map$fp, "fp")

production_curve_wells = censor(production_curve_wells, "well")
special_wells = censor(special_wells, "well")
tsplotwells = censor(tsplotwells, "well")

4.2.3 Preprocessing

Generate metadata, such as which wells have which data sources, and translate facility names into unique integer IDs. Also creates dummy facilities for multiple purposes.

# combine with extra
regression_df = plyr::rbind.fill(regression_df, extra_liq)

regression_df = regression_df %>%
  mutate(date_numeric = as.numeric(date - base_datetime)) %>%
  mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA))  # remove dates before baseline
dry_df = dry_df %>%
  filter(well %ni% unique(regression_df$well)) %>%
  mutate(date_numeric = as.numeric(as.POSIXct(date) - base_datetime)) %>%
  mutate(date_numeric=ifelse(date_numeric>0, date_numeric, NA))  # remove dates before baseline
well_fp_map = well_fp_map %>% select(well, fp) %>% drop_na()

# today_numeric = (Sys.time() - base_datetime) %>% as.numeric()
today_numeric = (today_datetime - base_datetime) %>% as.numeric()

# assign unique facility IDs
liq_wells = unique(regression_df$well) # aka production curve wells
dry_wells = unique(dry_df$well)        # aka time series wells
map_wells = unique(well_fp_map$well)   # any well mapped in config

well_names = unique(c(liq_wells, dry_wells))
fp_names = c(well_fp_map$fp, fp_gen_map$fp, fp_constants$fp) %>% unique()

fluid_types = c('ip', 'lp', 'w')
gen_names = gen_constants$gen %>% unique() %>% sort()
ip_gen_names = paste(gen_names, 'ip', sep='_')
lp_gen_names = paste(gen_names, 'lp', sep='_')
w_gen_names = paste(gen_names, 'w', sep='_')
dummy_gen_names = c(ip_gen_names, lp_gen_names, w_gen_names) %>% sort()
all_names = c('DUMMY', well_names, fp_names, dummy_gen_names, gen_names)
ids = 1:length(all_names)
names(ids) = all_names

# check data quality
no_data_wells = map_wells[!map_wells %in% c(liq_wells, dry_wells)]  # see which ones we're completely guessing for
no_map_wells = c(liq_wells, dry_wells)[!c(liq_wells, dry_wells) %in% map_wells]
missing = data.frame(Wells = c(paste(no_map_wells, collapse=", ")),
                     row.names = c("Data available but no FP"))
print(xtable(missing, type = "latex",
             caption=paste0("Potential data quality issues. ", names(ids)[71], " is known to be not connected, and ", names(ids)[31], " has an A/B pairing with ", names(ids)[32], "."),
             label="tab:quality"),
      file = "../_media/quality.tex")

# add names in data with IDs
regression_df = regression_df %>% mutate(well_id=ids[well])
dry_df = dry_df %>% mutate(well_id=ids[well])
operating_conditions = operating_conditions %>% mutate(well_id=ids[well]) %>% rename(whp_pred=whp)
fp_constants = fp_constants %>% mutate(fp_id=ids[fp])
gen_constants = gen_constants %>% mutate(gen_id=ids[gen]) %>% select(-gen)
well_fp_map = well_fp_map %>% mutate(well_id=ids[well], fp_id=ids[fp]) %>% select(-c(well, fp))
fp_gen_map = fp_gen_map %>% mutate(fp_id=ids[fp], gen_ip_id=ids[gen_ip], gen_lp_id=ids[gen_lp], gen_w_id=ids[gen_w]) %>% select(-c(fp, gen_ip, gen_lp, gen_w))

incomplete.fps = unique(well_fp_map %>%
  filter(is.na(well_id)) %>%
  mutate(fp = names(ids)[fp_id]) %>%
  pull(fp))

4.2.4 Graph

Work out which of the (now uniquely integer-identified) facilities flows to which. Then generates a graphic to check for correctness.

# create connectivity matrix. i flows to j
# wells to FPs
v = matrix(0, nrow=length(ids), ncol=length(ids))
v[1,-1] = 1
for (i in 1:nrow(well_fp_map)) {
  id_i = well_fp_map[[i, 1]]
  id_j = well_fp_map[[i, 2]]
  v[id_i, id_j] = 1
}
# send ip/lp/w flows to dummy gens
for (i in 1:nrow(fp_gen_map)) {
  id_i = fp_gen_map[[i, 1]]
  for (j in 2:ncol(fp_gen_map)) {
    facility_j = names(ids)[fp_gen_map[[i, j]]]
    facility_dummy_j = paste(facility_j, fluid_types[j-1], sep='_')
    id_j = ids[facility_dummy_j]
    if (!is.na(id_j)) {
      v[id_i, id_j] = 1
    }
  }
}
# dummy gens to gens
for (i in 1:nrow(gen_constants)) {
  id_j = gen_constants$gen_id[i]
  facility_j = names(ids)[id_j]
  for (fluid in fluid_types) {
    facility_dummy_i = paste(facility_j, fluid, sep='_')
    id_i = ids[facility_dummy_i]
    v[id_i, id_j] = 1
  }
}

# convert form
m = matrix(0, nrow=nrow(v), ncol=max(colSums(v)))
rownames(m) = all_names
for (i in 1:nrow(v)) {
  for (j in 1:ncol(v)) {
    if (v[[i, j]]==1) {
      m[j, sum(m[j,]>0)+1] = i
    }
  }
}
flows_to = function(well) {
  return(names(ids)[m[well,]][-1])
}

# generate coordinates
dummy_locs = data.frame(name='DUMMY', x=-0.1, y=0)
well_locs = data.frame(name=well_names, x=0, y=seq(1, 1/(length(well_names)-1), length.out=length(well_names)))
fp_locs = data.frame(name=fp_names, x=1, y=seq(0, 1, length.out=length(fp_names)))
gen_dummy_locs = data.frame(name=dummy_gen_names, x=2, y=seq(0, 1, length.out=length(dummy_gen_names)))
gen_locs = data.frame(name=gen_names, x=2.5, y=seq(1/11, 10/11, length.out=length(gen_names)))
locs = rbind(dummy_locs, well_locs, fp_locs, gen_dummy_locs, gen_locs)
locs$id = ids[locs$name]
locs = locs %>% arrange(id)

g = graph_from_adjacency_matrix(v) %>%
  set_vertex_attr('label', value=all_names) %>%
  set_vertex_attr('x', value=as.vector(locs$x)) %>%
  set_vertex_attr('y', value=as.vector(locs$y)) %>%
  set_vertex_attr('label.degree', value=pi) %>%
  as.undirected()
V(g)$size = ifelse(V(g)$label %in% well_names, 4, 8)
V(g)$color = ifelse(V(g)$label %in% dry_wells, "red", ifelse(V(g)$label %in% no_data_wells, "grey", "orange"))
E(g)$color = "black"
E(g)[which(tail_of(g, E(g))$label=="DUMMY")]$color = "grey"

# png("../_media/full_network.png")
# par(mar=c(0,3,0,0), family="Times")
# plot(g, vertex.label.dist=3,
#      mark.groups = list(wells=ids[well_names], fps=ids[fp_names], gens=ids[gen_names]),
#      mark.col = "#DDDDDD",
#      mark.border = NA)
# text(c(-1, -0.3, 0.4, 0.9), 1.15, c("Wells", "Flash plants", "Dummy gens", "Generators"), cex=1.25)
# dev.off()
plot(g, vertex.label.dist=3,
     mark.groups = list(wells=ids[well_names], fps=ids[fp_names], gens=ids[gen_names]),
     mark.col = "#DDDDDD",
     mark.border = NA)

The dummy node is necessary because when indexing a subset of flows that go into a node, this subset cannot be empty. The dummy node has zero mass flowing out of it.

4.2.5 Format Data

JAGS requires data to be real numbers, vectors or matrices in a named list. It can also impute NA values from a distribution. Data wrangling is a significant part of the work - potentially more than the actual model coding and the results analysis combined.

This code also centers some of the covariates so it does not have to be done in JAGS.

\[\begin{equation} x_\text{whp} \leftarrow x_\text{whp} - \overline{x_\text{whp}} \end{equation}\]
regression_list = regression_df %>% select(well_id, whp, mf, date_numeric) %>% as.list()
dry_list = dry_df %>%
  filter(date < prediction_date) %>%
  rename(well_id_dry=well_id, mf_dry=mf, date_numeric_dry=date_numeric) %>% # use these in a different regression
  select(well_id_dry, mf_dry, date_numeric_dry) %>% as.list()
operating_conditions_list = operating_conditions %>% arrange(well_id) %>% select(whp_pred) %>% as.list()
fp_constants_list = as.list(fp_constants)
gen_constants_list = as.list(gen_constants %>% select(gen_id, factor))
facilities = data.frame(id=ids) %>%
  left_join(operating_conditions %>% rename(id=well_id) %>% filter(id %in% ids) %>% select(-well), by='id') %>%
  left_join(gen_constants %>% select(factor, id=gen_id), by='id') %>%
  left_join(fp_constants %>% rename(id=fp_id), by='id') %>%
  filter(id %in% ids) %>%  # in case extras specified in data
  mutate(mf_pred=NA) %>%
  mutate(n_inflows=colSums(v))

well_ids = ids[well_names]
liq_well_ids = ids[liq_wells]
dry_well_ids = ids[dry_wells]
fp_ids = ids[fp_names]
ip_gen_ids = ids[ip_gen_names]
lp_gen_ids = ids[lp_gen_names]
w_gen_ids = ids[w_gen_names]
gen_ids = ids[gen_names]

# force all mass to IP steam
dry_fps = c("poi dry", "direct ip")
dry_fp_ids = ids[dry_fps]
facilities$hf_ip[facilities$id %in% dry_fp_ids] = 10
facilities$hfg_ip[facilities$id %in% dry_fp_ids] = 10
facilities_list = facilities %>% select(-id) %>% as.list()

# experimental TS data matrix for dry wells
ar_order = 1
empty = setNames(data.frame(matrix(ncol = length(all_names), nrow = 0)), all_names)
drymatrix = dry_df %>% 
  select(well, date_numeric, mf) %>% 
  spread(well, mf) %>% 
  select(-date_numeric)
drymatrix = empty %>%
  full_join(drymatrix) %>%
  as.matrix()
ar_well_ids = which(complete.cases(t(drymatrix[1:(ar_order+1),])))
ar_wells = names(ids)[ar_well_ids]
# which wells can we not use AR for
dry_no_ar_wells = dry_wells[!dry_well_ids %in% ar_well_ids]
dry_no_ar_well_ids = ids[dry_no_ar_wells]

# insert production curve predictions
stopifnot(all(tsplotwells %in% dry_df$well))
tsplotwells = ar_wells
days_since_last = as.integer(today_datetime - as.POSIXct(max(dry_df$date)))
prod = expand.grid(whp_prod=seq(6, 16, length.out=10),
                  well_id_prod=ids[production_curve_wells])
ts = expand.grid(date_numeric_ts=seq(min(dry_df$date_numeric), max(dry_df$date_numeric)+days_since_last, length.out=10),
                 well_id_ts=ids[tsplotwells])
prod_list = prod %>% as.list
ts_list = ts %>% as.list

# extend matrix for prediction
drymatrix = rbind(drymatrix, matrix(NA, nrow=days_since_last, ncol=ncol(drymatrix)))

# combine into one list
data = c(regression_list, dry_list, facilities_list, prod_list, ts_list,
         list(well_ids=well_ids, liq_well_ids=liq_well_ids, 
              dry_well_ids=dry_well_ids, dry_no_ar_well_ids=dry_no_ar_well_ids,
              fp_ids=fp_ids,
              gen_ids=gen_ids, ip_gen_ids=ip_gen_ids, lp_gen_ids=lp_gen_ids, w_gen_ids=w_gen_ids,
              today_numeric=today_numeric, m=m, dummy=1,
              ts=drymatrix, ts_ar=drymatrix, ts_ema=drymatrix, ar_well_ids=ar_well_ids))
# data$whp_pred[is.na(data$whp_pred)] <- mean(data$whp_pred, na.rm=T)

# center covariates
mean_whp <- mean(data$whp, na.rm=T)
mean_date_numeric <- mean(data$date_numeric, na.rm=T)

data$whp_c <- data$whp - mean_whp
data$whp_pred_c <- data$whp_pred - mean_whp
data$whp_prod_c <- data$whp_prod - mean_whp
data$date_numeric_c <- data$date_numeric - mean_date_numeric
data$today_numeric_c <- data$today_numeric - mean_date_numeric
data$date_numeric_dry_c <- data$date_numeric_dry - mean_date_numeric
data$date_numeric_ts <- data$date_numeric_ts - mean_date_numeric

pidataplot = ggplot(regression_df %>% filter(source=="PI Database"), aes(x=whp, y=mf, color=well)) +
  geom_point() +
  labs(title=paste("PI Regression Data from", min(extra_liq$date), "to", max(extra_liq$date)),
       x="Well-head pressure (bar)", 
       y="Mixed-phase mass flow (T/h)",
       color="Well") +
  guides(color=guide_legend(ncol=2)) +
  ggsave('../_media/pi_data.png', width=24.7, height=12, units='cm')
ggplotly(pidataplot)

4.3 Model

JAGS accepts a model in a text string. It uses an R-like syntax, but is a declarative language not sequential. We do basic manipulation of the output traces.

code = "
data {
  D <- dim(ts)
}
model {
  ##############################################
  # fit individual regressions to liquid wells #
  ##############################################
  for (i in 1:length(mf)) {
    mu[i] <- Intercept[well_id[i]] + beta_whp[well_id[i]] * whp_c[i] + beta_date[well_id[i]] * date_numeric_c[i]
    mf[i] ~ dnorm(mu[i], tau[well_id[i]])
    mf_fit[i] ~ dnorm(mu[i], tau[well_id[i]])
    # mf_fit[i] ~ dnorm(mu[i]*measurement_error_factor[i], tau[well_id[i]])
    # measurement_error_factor[i] ~ dunif(0.9, 1.1)
  }
  # fit regression to dry wells
  for (i in 1:length(mf_dry)) {
    mu_dry[i] <- Intercept[well_id_dry[i]] + beta_date[well_id_dry[i]] * date_numeric_dry_c[i]
    mf_dry[i] ~ dnorm(mu_dry[i], tau[well_id_dry[i]])
    mf_dry_fit[i] ~ dnorm(mu_dry[i], tau[well_id_dry[i]])
    # measurement_error_factor_dry[i] ~ dunif(0.9, 1.1)
  }
  for (j in dry_well_ids) {
    Intercept[j] ~ dnorm(0, 1e-12)
    beta_date[j] ~ dnorm(0, 1e-12)
    tau[j] ~ dgamma(1e-12, 1e-12)
  }
  # experimental AR1 model for dry wells
  for (j in ar_well_ids) {
    for (t in 2:D[1]) {
      mu_ar[t,j] <- c_ar[j] + theta_ar[j]*ts_ar[t-1,j]
      ts_ar[t,j] ~ dnorm(mu_ar[t,j], tau_ar[j]) T(0,)
    }
    theta_ar[j] ~ dnorm(0, 1e-12)
    c_ar[j] ~ dnorm(0, 1e-12)
    tau_ar[j] ~ dgamma(1e-12, 1e-12)
  }
  # experimental EWMA model (use at your own risk)
  for (j in ar_well_ids) {
    for (t in 2:D[1]) {
      mu_ema[t,j] <- alpha*mu_ema[t-1,j] + (1-alpha)*ts_ema[t,j]
      ts_ema[t,j] ~ dnorm(mu_ema[t-1,j], tau_ema[j]) T(0,)
    }
    mu_ema[1,j] <- ts_ema[1,j]
    theta_ema[j] ~ dnorm(0, 1e-12)
    c_ema[j] ~ dnorm(0, 1e-12)
    tau_ema[j] ~ dgamma(1e-12, 1e-12)
  }
  alpha ~ dbeta(0.5, 0.5)

  # HIERARCHICAL
  # fills in for any missing wells
  for (j in liq_well_ids) {
    Intercept[j] ~ dnorm(mu_Intercept, tau_Intercept)
    beta_whp[j] ~ dnorm(mu_beta_whp, tau_beta_whp)
    # beta_whp2[j] ~ dnorm(mu_beta_whp2, tau_beta_whp2)
    beta_date[j] ~ dnorm(mu_beta_date, tau_beta_date)
    tau[j] ~ dgamma(1e-12, 1e-12)
    sd[j] <- 1/max(sqrt(tau[j]), 1e-12)
  }

  # fill in any missing data
  for (i in 1:length(mf)) {
    date_numeric_c[i] ~ dnorm(mu_date_numeric, tau_date_numeric)
  }
  mu_date_numeric ~ dnorm(0, 1e-12)
  tau_date_numeric ~ dnorm(1e-12, 1e-12)
  
  # set hyperparameters
  mu_Intercept ~ dnorm(0, 1e-12)
  mu_beta_whp ~ dnorm(0, 1e-12)
  # mu_beta_whp2 ~ dnorm(0, 1e-12)
  mu_beta_date ~ dnorm(0, 1e-12)
  tau_Intercept ~ dgamma(1e-12, 1e-12)
  tau_beta_whp ~ dgamma(1e-12, 1e-12)
  # tau_beta_whp2 ~ dgamma(1e-12, 1e-12)
  tau_beta_date ~ dgamma(1e-12, 1e-12)

  #####################################
  # production curve for verification #
  #####################################
  for (i in 1:length(whp_prod)) {
    mu_prod[i] <- Intercept[well_id_prod[i]] + beta_whp[well_id_prod[i]] * whp_prod_c[i] + beta_date[well_id_prod[i]] * today_numeric_c
    # mf_prod[i] ~ dnorm(mu_prod[i], tau[well_id_prod[i]])
    mf_prod[i] <- mu_prod[i]
  }
  for (i in 1:length(date_numeric_ts)) {
    mu_ts[i] <- Intercept[well_id_ts[i]] + beta_date[well_id_ts[i]] * date_numeric_ts[i]
    mf_ts[i] ~ dnorm(mu_ts[i], tau[well_id_ts[i]])
  }

  #########################################################
  # simple model to fill in missing FP enthalpy constants #
  #########################################################
  for (i in fp_ids) {
    # missing fp constants
    hf_ip[i] ~ dgamma(param[1], param[7])
    hg_ip[i] ~ dgamma(param[2], param[8])
    hfg_ip[i] ~ dgamma(param[3], param[9])
    hf_lp[i] ~ dgamma(param[4], param[10])
    hg_lp[i] ~ dgamma(param[5], param[11])
    hfg_lp[i] ~ dgamma(param[6], param[12])
  }
  for (i in c(1, well_ids)) { 
    h[i] ~ dgamma(param[13], param[14])
    whp_pred_c[i] ~ dnorm(param[15], param[16])
  } # missing well constants
  for (i in 1:16) { param[i] ~ dgamma(1e-12, 1e-12) }               # uniform priors

  ########################################
  # make predictions (the stuff we want) #
  ########################################
  mf_pred[dummy] <- 0  # dummy well
  ip_sf[dummy] <- 0
  lp_sf[dummy] <- 0
  wf[dummy] <- 0
  
  # use production curve
  for (j in liq_well_ids) {
    mf_pred[j] <- max(Intercept[j] + beta_whp[j] * whp_pred_c[j] + beta_date[j] * today_numeric_c, 0)
  }
  # use naive TS reg
  for (j in dry_well_ids) { #dry_no_ar_well_ids) {
    mf_pred[j] <- max(Intercept[j] + beta_date[j] * today_numeric_c, 0)
  }
  # use AR(1)
  # for (j in ar_well_ids) {
  #   mf_pred[j] <- mu_ar[D[1], j]
  # }

  for (i in fp_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    h[i] <- sum(mf_pred[m[i, 1:n_inflows[i]]] * h[m[i, 1:n_inflows[i]]]) / ifelse(mf_pred[i]!=0, mf_pred[i], 1)

    ip_sf[i] <- min(max((h[i] - hf_ip[i]), 0) / hfg_ip[i], 1) * mf_pred[i]
    lp_sf[i] <- min(max((min(hf_ip[i], h[i]) - hf_lp[i]), 0) / hfg_lp[i], 1) * (mf_pred[i] - ip_sf[i])

    total_sf[i] <- ip_sf[i] + lp_sf[i]
    wf[i] <- mf_pred[i] - total_sf[i]
  }
  # dummy gens and actual gens
  for (i in ip_gen_ids) { mf_pred[i] <- sum(ip_sf[m[i, 1:n_inflows[i]]]) }
  for (i in lp_gen_ids) { mf_pred[i] <- sum(lp_sf[m[i, 1:n_inflows[i]]]) }
  for (i in w_gen_ids) { mf_pred[i] <- sum(wf[m[i, 1:n_inflows[i]]]) }
  for (i in gen_ids) {
    mf_pred[i] <- sum(mf_pred[m[i,1:n_inflows[i]]])
    power[i] <- mf_pred[i] / mu_factor[i]
    mu_factor[i] ~ dunif(0.95*factor[i], 1.05*factor[i])  # uncertainty from email
  }
  total_power <- sum(power[gen_ids])
}
"
# cat(code, file="model.txt")

vars =  c('mf_fit',
          'mf_dry_fit',
          'mf_ts',
          'mf_prod',
          'mf_pred',
          'beta_date',
          'sd',
          'power',
          'total_sf',
          'mu_ar',
          'ts_ar',
          'mu_ema',
          'ts_ema',
          'alpha',
          'ip_sf',
          'lp_sf',
          'wf',
          paste0('h[', fp_ids, ']'),
          paste0('mu_', c('Intercept', 'beta_whp', 'beta_date')),
          'total_power')
n_chains = 2
burn_in = 100

model = jags.model(textConnection(code), data, n.chains=n_chains)
## Compiling data graph
##    Resolving undeclared variables
##    Allocating nodes
##    Initializing
##    Reading data back into data table
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 6259
##    Unobserved stochastic nodes: 3916
##    Total graph size: 29435
## 
## Initializing model
update(model, burn_in)
out = coda.samples(model, n.iter=round(n_steps/n_chains), variable.names=vars)
outmatrix = as.matrix(out)
outframe = as.data.frame(outmatrix) %>%
  gather(key=facility, value=value) %>%
  mutate(variable=gsub("\\[.*$", "", facility), facility=parse_number(facility, na=c("NA")))
outframe$facility = factor(names(ids)[outframe$facility])

4.4 Convergence Tests

One of the difficulties with MCMC approximations is they often require a burn-in (warm-up) period before settling into the stationary distribution of the Markov chain. Only the stationary distribution corresponds to the joint distribution we wish to sample from. In most practical uses, there is no way to predict convergence, so we diagnose convergence by monitoring the sample trace and running diagnostic tests.

4.4.1 Trace plots

Poor convergence or mixing is indicated by a strong trend at the beginning of the trace plot.

trace1 <- outframe %>%
  filter(variable=='mf_pred', facility==censor('wk256', "well")) %>%
  mutate(index = 1:nrow(.))
trace2 <- outframe %>%
  filter(variable=='total_power') %>%
  mutate(index = 1:nrow(.))
trace3 <- outframe %>%
  filter(variable=='mu_Intercept') %>%
  mutate(index = 1:nrow(.))
traceplot = ggplot(trace1, aes(x=index, y=value, color=variable)) +
  geom_line(alpha=0.75) +
  geom_line(alpha=0.75) +
  geom_line(alpha=0.75) +
  coord_cartesian(xlim = c(max(trace1$index)-1000, max(trace1$index))) +
  labs(title="Trace Plot (Single chain)", x="Iteration", y="Parameter value", color="Variable")# +
  # ggsave('../_media/trace_plot.png', width=24.7, height=8, units='cm')
ggplotly(traceplot)

4.4.2 Geweke

Geweke’s convergence diagnostic for MCMC samples tests for equality of the means in the first 10% and last 50% of the trace. The means will be equal if the sample is drawn from a stationary distribution, indicating the burn-in period has been successfully excluded.

If true univariate convergence has been achieved, we expect 95% of variables to pass Geweke’s test with a z-score less than 1.96 with 95% confidence.

random_var_ix = sample.int(ncol(outmatrix), 100) # 100 random var because it takes too long
geweke.out = geweke.diag(out[,random_var_ix])
geweke.df = data.frame(Index = 1:length(unlist(geweke.out)),
                       z = unlist(geweke.out[1])) %>%
  mutate(out = ifelse(abs(z)>1.96, T, F)) %>%
  drop_na()
proportion_out = sum(geweke.df$out) / nrow(geweke.df)
gewekeplot = ggplot(geweke.df, aes(x=Index, y=z)) +
  geom_point() +
  geom_hline(data=data.frame(value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  labs(title=paste0("Geweke z-score. ", round(proportion_out, 2)*100, "% of points lie outside the 95% confidence interval."))# +
  # ggsave('../_media/geweke.png', width=24.7, height=6, units='cm')
ggplotly(gewekeplot)

4.4.3 Gelman

The Gelman-Rubin convergence diagnostic gives the potential scale reduction factor (PSRF) for each parameter. This requires at least two independent chains and tests whether the chains have converged to identical distributions. If the chains have not converged, the scale reduction factors will have upper confidence limits greater than one. It is possible that when run indefinitely, the variance of the parameter estimate could shrink by the PSRF.

gelman.out = gelman.diag(out[,c(paste0('mf_pred[', 8:9, ']'), 'beta_date[9]', 'mu_beta_whp', 'mu_beta_date', 'mu_Intercept', 'total_power')])[[1]] %>% as.data.frame()
kable(gelman.out) %>% kable_styling()
Point est. Upper C.I.
mf_pred[8] 1.0021015 1.014338
mf_pred[9] 0.9995892 1.001750
beta_date[9] 1.0649835 1.263674
mu_beta_whp 1.0463801 1.195384
mu_beta_date 1.1455117 1.518625
mu_Intercept 1.0180951 1.051931
total_power 1.0796053 1.170108

Some of the upper CIs are slightly greater than one, but not significantly. Large PSRFs are acceptable if they are in components of the network that do not affect parameters of interest.

4.4.4 Raftery

Raftery’s diagnostic gives the number of samples required to estimate a quantile (or credible interval) to a certain accuracy. In this notebook we only run 1000 samples so it says we do not have enough.

raftery.out = raftery.diag(out[,c(paste0('mf_pred[', 8:9, ']'), 'beta_date[9]', 'mu_beta_whp', 'mu_beta_date', 'mu_Intercept', 'total_power')])
raftery.out[[1]]
## 
## Quantile (q) = 0.025
## Accuracy (r) = +/- 0.005
## Probability (s) = 0.95 
## 
## You need a sample size of at least 3746 with these values of q, r and s

4.5 Posteriors

We generate density plots in their most basic forms without post-processing.

4.5.1 Well Mass Flow

g1 = ggplot(outframe %>% 
              filter(facility %in% well_names, variable=="mf_pred", value>0) %>%
              mutate(source = ifelse(facility %in% dry_wells, "PI time series", "Production curve")), 
            aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) +
  facet_grid(source~.) +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title=paste("Posterior Well Mass Flows for", prediction_date), x="Mass flow (T/h)", y="Scaled density", fill="Facility")# +
  # ggsave('../_media/mf_wells.png', width=24.7, height=8, units='cm')
ggplotly(g1, tooltip=c('facility', 'value'))

4.5.2 Decline Rate

An operator might like to see which wells are declining the fastest.

g2 = ggplot(outframe %>% filter(variable=="beta_date", facility %in% special_wells), aes(x=value, fill=facility)) +
  geom_density(alpha=0.5, color=NA) +labs(title="Posterior Decline Rate of Test Data", x="beta_date (T/h/Bar)", y="Density", fill="Facility") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank())# +
  # ggsave('../_media/beta_date.png', width=24.7, height=6, units='cm')
ggplotly(g2, tooltip=c('facility', 'value'))

4.5.3 FP Mass Flow

g4 = ggplot(outframe %>% filter(facility %in% gen_names, variable=="mf_pred", value>0), aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) + 
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title=paste("Posterior Generator Values for", prediction_date), x="Mass flow (T/h)", y="Scaled density", fill="Facility")# +
  # ggsave('../_media/mf_gens.png', width=24.7, height=10, units='cm')
ggplotly(g4, tooltip=c('facility', 'value'))

4.5.4 Gen Mass Flow

g5.actual = data.frame(facility = c("WRK", "THI", "POI", "BIN"),
                       value = c(121.73567, 172.18096, 51.53028, 9.98687))
g5 = ggplot(outframe %>% filter(facility %in% gen_names, variable=="power", value>0), aes(x=value, fill=facility)) +
  geom_density(aes(y=..scaled..), alpha=0.5, color=NA) + xlim(0, NA) +
  geom_vline(data=g5.actual, aes(xintercept=value, color=facility)) +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(x="Power (MW)", y="Scaled density", fill="Facility")# +
  # ggsave('../_media/power_gens.png', width=24.7, height=10, units='cm')

ggplotly(g5, tooltip=c('facility', 'value'))
# tsgrob4.5 = grid_arrange_shared_legend(g4, g5, nrow=2, ncol=1, position = "right")
# ggsave('../_media/gens.png', tsgrob4.5, width=24.7, height=6, units='cm')

4.5.5 Gen Power

tb6 <- outframe %>% filter(variable=="sd") %>% select(facility, value) %>%
  mutate(well=factor(facility)) %>%
  group_by(well) %>%
  summarise(Mean = mean(value), 
            `Lower 2.5%` = quantile(value, 0.025), 
            `Upper 97.5%` = quantile(value, 0.975)) %>%
  mutate_if(is.numeric, round, 3) %>%
  inner_join(regression_df %>% mutate(well=factor(names(ids)[well_id])) %>% group_by(well) %>% summarise(n=n()), by="well")
g6 = ggplot(outframe %>% filter(variable=="sd") %>% filter(facility %in% special_wells), aes(x=value, fill=facility)) +
  geom_density(alpha=0.5, color=NA) + coord_cartesian(xlim=c(0, max(tb6$`Upper 97.5%`))) +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title="Posterior Flow Deviation Estimates", x="Standard deviation", y="Density", fill="Facility")# +
  # ggsave('../_media/standard_deviation.png', width=24.7, height=10, units='cm')
ggplotly(g6, tooltip=c('facility', 'value'))

4.6 Advanced Analysis

4.6.1 High Variance wells

nrow.source = function(df, facilityname, sourcename) {
  stopifnot(length(sourcename)==1)
  return(nrow(df %>% filter(well==facilityname, source==sourcename)))
}
well_summaries = outframe %>%
  filter(facility %in% well_names, variable=="mf_pred") %>%
  group_by(facility) %>%
  summarise(mean = mean(value),
            sd = sd(value),
            n_test = nrow.source(regression_df, unique(facility),"Well Tests"),
            n_pi = nrow.source(regression_df, unique(facility), "PI Database"),
            use.test = ifelse(n_test>0, "Test data", "No test data"),
            use.pi = ifelse(n_pi>0, "PI data", "No PI data")) %>%
  arrange(desc(sd))
well_summaries$production.curve = ifelse(well_summaries$facility %in% liq_wells, "Production curve", "Time series")

fp_summaries = list(fp14 = well_summaries %>% filter(facility %in% flows_to(censor('fp14', 'fp'))),
                    fp15 = well_summaries %>% filter(facility %in% flows_to(censor('fp15', 'fp'))),
                    fp16 = well_summaries %>% filter(facility %in% flows_to(censor('fp16', 'fp'))))
for (fp in names(fp_summaries)) {
  print(xtable(fp_summaries[[fp]] %>% select(-c(use.test, use.pi, production.curve)),
               type = "latex",
               caption=paste("Data methods feeding flash plant", censor(fp, 'fp')),
               label=paste0("tab:well_summaries_", fp)),
      table.placement = "H",
      file = paste0("../_media/summaries_", fp, ".tex"))
}
fp_summaries %>% kable() %>% kable_styling() %>% scroll_box(height = "400px")
facility mean sd n_test n_pi use.test use.pi production.curve
wk242 322.13652 47.2385211 28 0 Test data No PI data Production curve
wk264 394.20917 17.2018634 34 30 Test data PI data Production curve
wk245 420.04092 15.0713041 41 30 Test data PI data Production curve
wk265 334.15445 14.4018421 34 30 Test data PI data Production curve
wk243 375.43871 8.3364384 52 30 Test data PI data Production curve
wk222 106.31548 8.2999560 44 0 Test data No PI data Production curve
wk263 221.33877 3.2113852 34 30 Test data PI data Production curve
wk237 14.63283 1.0950901 0 30 No test data PI data Production curve
wk233 13.56930 0.6175667 0 30 No test data PI data Production curve
wk228 14.98824 0.1108668 0 30 No test data PI data Production curve
facility mean sd n_test n_pi use.test use.pi production.curve
wk256 218.34271 18.4374652 32 30 Test data PI data Production curve
wk268 87.26872 17.7147487 27 30 Test data PI data Production curve
wk255 378.43331 13.9252038 41 30 Test data PI data Production curve
wk267 307.33283 13.2077826 31 30 Test data PI data Production curve
wk269 132.74935 11.8402173 25 30 Test data PI data Production curve
wk250 25.05857 7.3247934 0 30 No test data PI data Production curve
wk247 233.26984 6.4723543 48 30 Test data PI data Production curve
wk272 207.48286 2.8329014 7 30 Test data PI data Production curve
wk241 45.77379 0.4400225 0 30 No test data PI data Production curve
wk238 60.20425 0.3770650 0 30 No test data PI data Production curve
wk251 22.60754 0.1940089 0 30 No test data PI data Production curve
wk234 30.77341 0.1904144 0 30 No test data PI data Production curve
wk240 23.41470 0.1603002 0 30 No test data PI data Production curve
wk252 58.88827 0.0628887 0 30 No test data PI data Production curve
facility mean sd n_test n_pi use.test use.pi production.curve
wk266 322.97855 16.788105 29 30 Test data PI data Production curve
wk262 632.61082 9.718498 36 30 Test data PI data Production curve
wk271 85.91758 6.376994 6 30 Test data PI data Production curve
wk260 296.04027 4.768856 33 30 Test data PI data Production curve
wk253 269.73092 4.695968 32 30 Test data PI data Production curve
wk261 250.40980 4.569097 37 30 Test data PI data Production curve
wk270 460.58794 4.185943 5 30 Test data PI data Production curve
wk258 198.66793 3.437390 38 30 Test data PI data Production curve
wk259 308.93622 3.220008 37 30 Test data PI data Production curve
wk254 226.69230 2.425235 39 30 Test data PI data Production curve
n_summaries = well_summaries %>%
  group_by(use.pi, use.test) %>%
  count()

sourceplot = ggplot(well_summaries, aes(x=1, y=log(sd))) +
  geom_boxplot(fill='steelblue') +
  geom_label(data=n_summaries, aes(x=-Inf, y=-Inf, hjust=0, vjust=0, label=paste0("n=", n), family="Times New Roman"), label.size=0, fill='white') +
  facet_grid(.~ use.pi + use.test) +
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank()) +
  labs(title="Differences in Production Error by Data Source", x="Production curve data source", y="log(standard deviation)")# +
  # ggsave('../_media/error_source.png', width=24.7*0.5, height=6, units='cm')
ggplotly(sourceplot)
sourcetab = well_summaries %>% select(facility, mean, sd, n_test, n_pi)
# print(xtable(sourcetab %>% head(), type = "latex",
#              caption="Upon inspection of the wells with the most variance, there is no immediate cause for high variance. This requires further investigation.",
#              label="tab:well_summaries"),
#       table.placement = "h",
#       file = "../_media/well_summaries.tex")
kable(cbind(sourcetab)) %>%
  kable_styling() %>%
  scroll_box(height = "400px")
facility mean sd n_test n_pi
wk86 83.8460372 68.2447659 4 0
wk242 322.1365192 47.2385211 28 0
wk28 87.3743099 45.7907525 26 0
wk27 164.9209111 36.8377389 36 0
wk59 115.4227252 33.3333745 31 0
wk65 26.9289968 32.7253182 2 0
wk26b 111.7886843 27.2720525 13 0
wk116 74.0301302 26.4451621 10 0
wk81 61.0126780 25.1791016 24 0
wk123 185.1815737 23.3284710 31 0
wk72 152.2381236 21.6992865 26 0
wk76 124.6642094 21.5574617 31 0
wk67 93.3161893 21.2510659 26 0
wk96 28.4474001 21.2088843 10 0
wk71 128.2472765 19.0542452 32 0
wk256 218.3427094 18.4374652 32 30
wk268 87.2687212 17.7147487 27 30
wk264 394.2091707 17.2018634 34 30
wk266 322.9785462 16.7881046 29 30
wk26a 118.8068633 15.1818908 16 0
wk245 420.0409217 15.0713041 41 30
wk265 334.1544486 14.4018421 34 30
wk83 134.4234464 14.0279330 26 0
wk255 378.4333068 13.9252038 41 30
wk267 307.3328349 13.2077826 31 30
wk74 157.2891438 12.5518300 26 0
wk269 132.7493509 11.8402173 25 30
wk55 61.9725023 10.1604612 47 0
wk235 206.6448877 9.8779687 20 0
wk262 632.6108220 9.7184979 36 30
wk124 253.1097726 9.3357234 31 0
wk244 196.2798991 9.2025407 37 30
wk70 150.8437133 9.0633400 45 0
wk243 375.4387090 8.3364384 52 30
wk229 203.8152946 8.3261616 92 0
wk222 106.3154780 8.2999560 44 0
wk250 25.0585723 7.3247934 0 30
wk207 149.6589830 6.6177060 18 0
wk247 233.2698354 6.4723543 48 30
wk271 85.9175778 6.3769942 6 30
wk239 133.1476594 5.8199382 18 0
wk46 44.0406697 5.0652279 18 0
wk260 296.0402686 4.7688561 33 30
wk253 269.7309182 4.6959675 32 30
wk261 250.4098049 4.5690972 37 30
wk270 460.5879440 4.1859435 5 30
wk258 198.6679295 3.4373901 38 30
wk259 308.9362209 3.2200080 37 30
wk263 221.3387688 3.2113852 34 30
wk272 207.4828563 2.8329014 7 30
wk254 226.6923042 2.4252352 39 30
wk605 36.0438758 1.2774026 0 0
wk237 14.6328286 1.0950901 0 30
wk606 22.0390041 0.8709535 0 0
wk233 13.5693018 0.6175667 0 30
wk241 45.7737888 0.4400225 0 30
wk238 60.2042520 0.3770650 0 30
wk249 1.0435145 0.3707624 0 0
wk607 1.5229216 0.2727646 0 0
wk610 6.7454739 0.2520371 0 0
wk25 6.7978269 0.2043194 0 0
wk251 22.6075442 0.1940089 0 30
wk234 30.7734124 0.1904144 0 30
wk240 23.4146964 0.1603002 0 30
wk232 0.0940381 0.1111293 0 30
wk228 14.9882370 0.1108668 0 30
wk236 26.3413685 0.0845945 0 0
wk216 8.0725978 0.0829701 0 0
wk252 58.8882661 0.0628887 0 30
wk118 9.5151455 0.0332561 0 0
wk604 0.7825579 0.0311954 0 0
wk101 0.0030795 0.0021309 0 0
wk92 0.0012277 0.0009114 0 0
wk66 0.0000000 0.0000000 0 0
wk88 0.0000000 0.0000000 0 0

4.6.2 Regression Fits

prod = as.data.frame(outmatrix) %>%
  select(contains('prod')) %>%
  gather(key=facility, value=value) %>%
  mutate(which=parse_number(facility)) %>%
  mutate(whp=data$whp_prod[which],
         well = names(ids)[data$well_id_prod[which]]) %>%
  rename(mf=value) %>%
  group_by(well, whp) %>%
  summarise(lower=quantile(mf, 0.025),
            upper=quantile(mf, 0.975),
            mean=mean(mf))

plotdata = regression_df %>%
  filter(well_id %in% ids[production_curve_wells]) %>%
  mutate(datetime = factor(as.Date(date))) %>%
  mutate(source = factor(source, levels=c("Well Tests", "PI Database")))

# regression plot
regplot = ggplot(prod, aes(x=whp)) +
  geom_line(aes(y=mean, color=well)) +
  geom_ribbon(aes(ymin=lower, ymax=upper, fill=well), alpha=0.25) +
  geom_point(data=plotdata, aes(y=mf, color=well, size=date, shape=source), alpha=0.5) +
  labs(title="Linear Regression on Test and PI Data", x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Well", shape="Data source", size="Date", fill="Well") +
  coord_cartesian(xlim=c(min(plotdata$whp)*0.9,max(plotdata$whp)*1.1), ylim=c(0,max(plotdata$mf)*1.1))# +
  # ggsave('../_media/production_curve.png', width=24.7*0.48, height=24.7*0.48, units='cm')
ggplotly(regplot)

4.6.3 Time Series Plots

tsplotwells = ar_wells
ts_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_ts')) %>%
  gather() %>%
  mutate(index = parse_number(key)) %>% select(-key) %>%
  group_by(index) %>%
  summarise(lower=quantile(value, 0.025),
            upper=quantile(value, 0.975),
            mean=mean(value)) %>%
  cbind(ts) %>%
  mutate(well = factor(names(ids[well_id_ts])),
         date_numeric = date_numeric_ts)

# actual observations
tsplotdata = dry_df %>%
  filter(well_id %in% ids[tsplotwells]) %>%
  mutate(datetime = factor(as.Date(date)),
         facility = well)

# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ar")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# experimental EMA time series
ewma_fit = as.data.frame(outmatrix) %>%
  select(contains("mu_ema")) %>%
  gather() %>%
  mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
         facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
  select(facility, date_numeric, value) %>%
  group_by(facility, date_numeric) %>%
  summarise(mean=mean(value),
            lower=quantile(value, 0.025),
            upper=quantile(value, 0.975)) %>%
  filter(facility %in% tsplotwells)

# find plot limits
tsmax = max(c(ts_fit$upper, ar_fit$upper, ewma_fit$upper))

lintsplot = ggplot(ts_fit, aes(x=date_numeric, color=well, fill=well)) +
  geom_line(aes(y=mean), linetype="dashed") +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.25) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  coord_cartesian(ylim=c(0, 60)) +
  labs(title=paste("Linear Time Series Regression for Selected Wells in PI"), x="Days since baseline (2000)", linetype="")# +
  # ggsave('../_media/dry_time_series.png', width=24.7, height=8, units='cm')

arplot = ggplot(ar_fit %>% filter(facility %in% tsplotwells), aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
  ggsave('../_media/ar_experiment.png', width=24.7, height=8, units='cm')

ewmaplot = ggplot(ewma_fit, aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
  geom_line(data=tsplotdata, aes(y=mf)) +
  geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
  geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
  geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
  labs(title="EWMA Experiment", x="Days since first date")# +
  # ggsave('../_media/ewma_experiment.png', width=24.7, height=8, units='cm')

ggplotly(lintsplot)
ggplotly(arplot)
ggplotly(ewmaplot)
tsgrob = grid_arrange_shared_legend(lintsplot, arplot, ewmaplot, nrow=3, ncol=1, position = "bottom")

tsgrob
## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name              grob
## 1 1 (1-1,1-1) arrange   gtable[arrange]
## 2 2 (2-2,1-1) arrange gtable[guide-box]
# ggsave('../_media/ts_experiment.png', tsgrob, width=24.7, height=24, units='cm')

4.6.4 Goodness of fit (OLS regression)

liq_fit = as.data.frame(outmatrix) %>%
  select(contains('mf_fit')) %>%
  gather(key='index', value='fitted') %>%
  mutate(index=as.integer(parse_number(index))) %>%
  group_by(index) %>%
  summarise(lower=quantile(fitted, 0.025),
            upper=quantile(fitted, 0.975),
            Fitted=mean(fitted),
            std=sd(fitted)) %>%
  cbind(regression_df) %>%
  mutate(`Standardised residual` = (Fitted-mf)/std,
         Well = factor(names(ids[well_id])),
         Observed = mf) %>%
  gather(key="key", value="value", `Standardised residual`, Observed) %>%
  select(Well, key, Fitted, value, source)

diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
  geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = rep_len(1:25, length(unique(liq_fit$Well)))) +
  geom_smooth(color='black') +
  facet_wrap(~key, scales="free") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  # coord_cartesian(ylim=c(-4, 4)) +
  labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
  theme(legend.position = "bottom") +
  guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T))# +
  # ggsave('../_media/diagnostics.png', width=24.7, height=12, units='cm')
ggplotly(diagplot)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
selectwells = liq_fit %>% group_by(Well, key) %>% summarise(fittedsd = sd(Fitted)) %>%
  arrange(desc(fittedsd)) %>% head(56*2) %>% pull(Well)

observedplot = ggplot(liq_fit %>% filter(key=="Observed", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free") +
  geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b)) +
  labs(title="Linear Regression Fit Plots Per Well", x="Fitted mass flow (T/h)", y="Observed mass flow (T/h)", color="Data source") +
  theme(legend.position = "bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/observed.png', width=24.7, height=24.7, units='cm')

stdresplot = ggplot(liq_fit %>% filter(key=="Standardised residual", Well %in% selectwells), aes(x=Fitted, y=value)) +
  geom_point(aes(color=source), alpha=0.5) +
  geom_smooth(color=NA, alpha=0.5) +
  facet_wrap(~Well, scales="free_x") +
  geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
  # geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
  labs(title="Linear Regression Residual Plots Per Well", x="Fitted mass flow (T/h)", y="Standardised residual", color="Data source") +
  coord_cartesian(ylim=c(-5, 5)) + theme(legend.position="bottom")# +
  # guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
  # ggsave('../_media/stdres.png', width=24.7, height=24.7, units='cm')

ggplotly(observedplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(stdresplot)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
# stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
# ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
#   geom_density(fill="red", alpha=0.5, color=NA) +
#   geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))

4.6.5 Limits and Constraint Violations

sf.df <- outframe %>% 
  filter(str_detect(variable, "total_sf") & value > 0) %>% 
  droplevels()
limits = fp_constants %>%
  mutate(facility = names(ids)[fp_id]) %>%
  select(facility, limit) %>% 
  drop_na()

p.limits = sf.df %>%
  left_join(limits, by=c("facility")) %>%
  mutate(greater = value > limit) %>%
  group_by(facility) %>%
  summarise(p.greater = mean(greater)) %>%
  drop_na()

limitplot = ggplot(sf.df %>% filter(facility %ni% incomplete.fps), aes(x=value, fill=facility)) +
  facet_wrap(~facility, scales = "free_y", ncol=2) +
  geom_density(alpha=0.5, color=NA) +
  geom_vline(data=limits, aes(xintercept=limit), color="red") +
  geom_label(data=p.limits %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>lim)=", p.greater), family="Times New Roman"), color="black", label.size=0, fill='white') +
  theme(legend.position="none",
        axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit")# +
  # ggsave('../_media/constraints.png', width=24.7, height=10, units='cm')
ggplotly(limitplot)

4.6.6 Flow Comparison

flow.df <- outframe %>% 
  filter(facility %in% fp_names) %>%
  filter(str_detect(variable, "mf_pred|ip_sf|lp_sf|wf") & value > 0) %>%
  mutate(variable=ifelse(variable=="mf_pred", "mf", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf")))

comparison = fp_constants %>% select("fp", contains("verification")) %>%
  rename(facility=fp) %>%
  gather(key="variable", value="value", -facility) %>%
  mutate(variable = gsub("^verification_", "", variable),
         variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

ps = flow.df %>%
  left_join(comparison, by=c("facility", "variable")) %>%
  mutate(greater = value.x > value.y) %>%
  group_by(facility, variable) %>%
  summarise(p.greater = mean(greater)) %>%
  mutate(variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
  drop_na()

verificationplot = ggplot(flow.df %>% filter(facility %ni% incomplete.fps), aes(x=value)) +
  geom_density(aes(y=..scaled.., fill=variable, color=variable), alpha=0.5, show.legend=F) +
  geom_vline(data=comparison %>% filter(facility %ni% incomplete.fps), aes(xintercept=value)) +
  geom_label(data=ps %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>x)=", p.greater), family="Times New Roman"), label.size=0) +
  facet_grid(facility~variable, scales="free", space="free_y") +
  theme(axis.text.y=element_blank(),
        axis.ticks.y=element_blank()) +
  labs(x="Value", y="Scaled density", title="Comparison Between Predicted FP Flows and Sample Data")# +
  # ggsave('../_media/verification.png', width=24.7, height=20, units='cm')
ggplotly(verificationplot)